In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
from statsmodels.sandbox.stats.multicomp import multipletests
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Load sample metadata (mapping file) and the OTU count table.
# Everything is read as str; numeric columns are coerced later.
mf = pd.read_csv('../data/mapping_cleaned_MrOS.txt', sep='\t', dtype=str, index_col='#SampleID')
bt = pd.read_csv('../data/biomtable.txt', sep='\t', dtype=str, index_col='#OTU ID')
# transpose so rows are samples (plus one 'taxonomy' row) and columns are OTUs
bt = bt.T
In [3]:
print(mf.shape, bt.shape) # bt has an additional row of 'taxonomy'
In [4]:
mf.head()
Out[4]:
In [5]:
bt.head()
Out[5]:
In [6]:
dat = pd.merge(mf, bt, left_index=True, right_index=True)
In [7]:
dat.shape
Out[7]:
In [8]:
dat.head()
Out[8]:
In [9]:
# vitamin D measurements and derived ratios; coerce from str to numeric,
# turning non-parseable entries into NaN
vars_vd = np.array(['OHVD3', 'OHV1D3', 'OHV24D3', 'ratio_activation', 'ratio_catabolism'])
for col in vars_vd:
    dat[col] = pd.to_numeric(dat[col], errors='coerce')
In [10]:
dat[vars_vd].describe()
Out[10]:
In [11]:
# every column after the metadata columns in the merged frame is an OTU
otu_cols = dat.columns[mf.shape[1]:]
len(otu_cols)
Out[11]:
In [12]:
# Spearman correlation of vars_vd[0] ('OHVD3') against every OTU,
# with Benjamini-Hochberg FDR correction across all OTUs.
i = 0
var = vars_vd[i]
rows = []
for otu in otu_cols:
    pair = dat[[var, otu]].dropna(axis=0, how='any')
    rho, pval = spearmanr(pair[var], pair[otu])
    rows.append([var, otu, bt.loc['taxonomy'][otu], rho, pval])
# output table, sorted by adjusted p-value
results = pd.DataFrame(rows, columns=['vars', 'otu', 'taxonomy', 'rho', 'pval']).dropna(axis=0, how='any')
results['fdr pval'] = multipletests(results['pval'], method='fdr_bh')[1]
results = results.sort_values(['fdr pval'], ascending=True)
# report taxa passing the 5% FDR threshold
index = results.loc[results['fdr pval'] <= 0.05].index
for i in range(len(index)):
    print(results.taxonomy[index[i]], results['fdr pval'][index[i]])
In [13]:
# check: top 5 OTUs by FDR-adjusted p-value
results.head(5)
Out[13]:
In [14]:
# Same Spearman/FDR screen for vars_vd[1] ('OHV1D3'); kept under its own
# name for later comparison against the lasso selection.
i = 1
var = vars_vd[i]
rows = []
for otu in otu_cols:
    pair = dat[[var, otu]].dropna(axis=0, how='any')
    rho, pval = spearmanr(pair[var], pair[otu])
    rows.append([var, otu, bt.loc['taxonomy'][otu], rho, pval])
# output table, sorted by adjusted p-value
results_spearman_OHV1D3 = pd.DataFrame(rows, columns=['vars', 'otu', 'taxonomy', 'rho', 'pval']).dropna(axis=0, how='any')
results_spearman_OHV1D3['fdr pval'] = multipletests(results_spearman_OHV1D3['pval'], method='fdr_bh')[1]
results_spearman_OHV1D3 = results_spearman_OHV1D3.sort_values(['fdr pval'], ascending=True)
# report taxa passing the 5% FDR threshold
index_OHV1D3 = results_spearman_OHV1D3.loc[results_spearman_OHV1D3['fdr pval'] <= 0.05].index
for i in range(len(index_OHV1D3)):
    print(results_spearman_OHV1D3.taxonomy[index_OHV1D3[i]],
          results_spearman_OHV1D3['rho'][index_OHV1D3[i]],
          results_spearman_OHV1D3['fdr pval'][index_OHV1D3[i]])
In [15]:
# check: top 10 OTUs for OHV1D3 by FDR-adjusted p-value
results_spearman_OHV1D3.head(10)
Out[15]:
In [16]:
# Same Spearman/FDR screen for vars_vd[2] ('OHV24D3').
i = 2
var = vars_vd[i]
rows = []
for otu in otu_cols:
    pair = dat[[var, otu]].dropna(axis=0, how='any')
    rho, pval = spearmanr(pair[var], pair[otu])
    rows.append([var, otu, bt.loc['taxonomy'][otu], rho, pval])
# output table, sorted by adjusted p-value
results = pd.DataFrame(rows, columns=['vars', 'otu', 'taxonomy', 'rho', 'pval']).dropna(axis=0, how='any')
results['fdr pval'] = multipletests(results['pval'], method='fdr_bh')[1]
results = results.sort_values(['fdr pval'], ascending=True)
# report taxa passing the 5% FDR threshold
index = results.loc[results['fdr pval'] <= 0.05].index
for i in range(len(index)):
    print(results.taxonomy[index[i]],
          results['rho'][index[i]],
          results['fdr pval'][index[i]])
In [17]:
# check: top 3 OTUs for OHV24D3
results.head(3)
Out[17]:
In [18]:
# Same Spearman/FDR screen for vars_vd[3] ('ratio_activation').
i = 3
var = vars_vd[i]
rows = []
for otu in otu_cols:
    pair = dat[[var, otu]].dropna(axis=0, how='any')
    rho, pval = spearmanr(pair[var], pair[otu])
    rows.append([var, otu, bt.loc['taxonomy'][otu], rho, pval])
# output table, sorted by adjusted p-value
results = pd.DataFrame(rows, columns=['vars', 'otu', 'taxonomy', 'rho', 'pval']).dropna(axis=0, how='any')
results['fdr pval'] = multipletests(results['pval'], method='fdr_bh')[1]
results = results.sort_values(['fdr pval'], ascending=True)
# report taxa passing the 5% FDR threshold
index = results.loc[results['fdr pval'] <= 0.05].index
for i in range(len(index)):
    print(results.taxonomy[index[i]],
          results['rho'][index[i]],
          results['fdr pval'][index[i]])
In [19]:
# store result for future check with lasso
# NOTE(review): plain assignment aliases `results` rather than copying it;
# safe here only because `results` is reassigned (not mutated) below.
results_spearman_activation = results
In [20]:
# check: top 10 OTUs for ratio_activation
results.head(10)
Out[20]:
In [21]:
# Same Spearman/FDR screen for vars_vd[4] ('ratio_catabolism').
# (This cell prints taxonomy and FDR p-value only, without rho, matching
# the original output.)
i = 4
var = vars_vd[i]
rows = []
for otu in otu_cols:
    pair = dat[[var, otu]].dropna(axis=0, how='any')
    rho, pval = spearmanr(pair[var], pair[otu])
    rows.append([var, otu, bt.loc['taxonomy'][otu], rho, pval])
# output table, sorted by adjusted p-value
results = pd.DataFrame(rows, columns=['vars', 'otu', 'taxonomy', 'rho', 'pval']).dropna(axis=0, how='any')
results['fdr pval'] = multipletests(results['pval'], method='fdr_bh')[1]
results = results.sort_values(['fdr pval'], ascending=True)
# report taxa passing the 5% FDR threshold
index = results.loc[results['fdr pval'] <= 0.05].index
for i in range(len(index)):
    print(results.taxonomy[index[i]],
          results['fdr pval'][index[i]])
In [22]:
# check: top 3 OTUs for ratio_catabolism
results.head(3)
Out[22]:
In [23]:
# complete-case frame for the lasso models: keep only samples with every
# vitamin D measure and OTU column present
tmp = dat[np.append(vars_vd, otu_cols)].dropna(axis=0, how='any')
print(tmp.shape)
tmp.head()
Out[23]:
In [24]:
# LassoLarsCV (10-fold) predicting vars_vd[0] ('OHVD3') from all OTUs.
X = tmp[otu_cols]
y = tmp[vars_vd[0]]
# 70/30 train/test split; fixed seed for reproducibility
pred_train, pred_test, tar_train, tar_test = train_test_split(
    X, y, test_size=.3, random_state=123)
model = LassoLarsCV(cv=10, precompute=True).fit(pred_train, tar_train)
In [25]:
np.sum(model.coef_)
Out[25]:
In [26]:
# NOTE(review): this cell is byte-identical to In[24] (same data, same seed),
# so it refits the exact same model — consider deleting the duplicate.
X = tmp[otu_cols]
y = tmp[vars_vd[0]]
# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(X, y, test_size=.3, random_state=123)
# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=True).fit(pred_train,tar_train)
In [27]:
np.sum(model.coef_)
Out[27]:
In [28]:
# NOTE(review): byte-identical to In[24] and In[26] — refits the same model.
X = tmp[otu_cols]
y = tmp[vars_vd[0]]
# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(X, y, test_size=.3, random_state=123)
# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=True).fit(pred_train,tar_train)
In [29]:
np.sum(model.coef_)
Out[29]:
In [30]:
# Refit for vars_vd[0], this time with 20-fold CV.
X = tmp[otu_cols]
# BUG FIX: `y` was never assigned in this cell and silently reused the value
# left over from the previous cell (hidden kernel state). Make the target
# explicit — same value as before (vars_vd[0]), so behavior is unchanged,
# but the cell now survives Restart & Run All on its own.
y = tmp[vars_vd[0]]
# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(X, y, test_size=.3, random_state=123)
# specify the lasso regression model
# NOTE(review): cv=20 here vs cv=10 in the cells above — confirm intentional
model=LassoLarsCV(cv=20, precompute=True).fit(pred_train,tar_train)
In [31]:
np.sum(model.coef_)
Out[31]:
In [32]:
# Coefficient table: one row per OTU, annotated with its taxonomy string.
reg = pd.DataFrame({'lasso coef': model.coef_}, index=list(X.columns))
reg['taxonomy'] = bt.loc['taxonomy'][reg.index]
In [33]:
# OTUs the lasso actually kept (non-zero coefficients)
subset = reg[reg['lasso coef'] != 0]
print(subset.shape)
print(subset.taxonomy.values, subset['lasso coef'].values)
In [34]:
# store result for future need
# NOTE(review): plain assignment aliases `subset` rather than copying it;
# safe only because `subset` is later reassigned, never mutated in place.
subset_activation = subset
subset_activation
Out[34]:
In [35]:
# check whether the lasso-selected OTUs also show up in the spearman result
selected = results_spearman_activation['otu'].isin(subset_activation.index)
same = results_spearman_activation[selected]
print(same.taxonomy.values)
print(same['fdr pval'])
In [36]:
results_spearman_activation.loc[results['fdr pval'] <= 0.05].index
Out[36]:
In [48]:
# plot coefficient progression along the lasso regularization path
m_log_alphas = -np.log10(model.alphas_)
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
# FIX: a label was set on the axvline but plt.legend() was never called, so
# the label never appeared; also removed the unused `ax = plt.gca()`.
plt.legend()
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
#plt.savefig('../figures/lasso_coef.png', bbox_inches='tight')
Out[48]:
In [47]:
# plot mean square error for each fold
# NOTE(review): `cv_alphas_` and `cv_mse_path_` are legacy LassoLarsCV
# attribute names; newer scikit-learn exposes these as `cv_alphas_` /
# `mse_path_` and removed the old aliases — confirm against the installed
# sklearn version before re-running.
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
# dotted lines: per-fold MSE curves; solid black: fold average
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
#plt.savefig('../figures/lasso_mse.png', bbox_inches='tight')
Out[47]:
In [49]:
# MSE from training and test data
# (import kept local to the cell, as in the original notebook)
from sklearn.metrics import mean_squared_error
mse_train = mean_squared_error(tar_train, model.predict(pred_train))
mse_test = mean_squared_error(tar_test, model.predict(pred_test))
print ('training data MSE')
print(mse_train)
print ('test data MSE')
print(mse_test)
In [50]:
# R-square from training and test data
r2_train = model.score(pred_train, tar_train)
r2_test = model.score(pred_test, tar_test)
print ('training data R-square')
print(r2_train)
print ('test data R-square')
print(r2_test)
In [51]:
# LassoLarsCV (10-fold) predicting vars_vd[4] ('ratio_catabolism').
X = tmp[otu_cols]
y = tmp[vars_vd[4]]
# 70/30 train/test split; fixed seed for reproducibility
pred_train, pred_test, tar_train, tar_test = train_test_split(
    X, y, test_size=.3, random_state=123)
model = LassoLarsCV(cv=10, precompute=True).fit(pred_train, tar_train)
In [52]:
np.sum(model.coef_)
Out[52]:
In [53]:
# check: coefficient table for the catabolism model, and how many OTUs survived
reg = pd.DataFrame({'lasso coef': model.coef_}, index=list(X.columns))
reg['taxonomy'] = bt.loc['taxonomy'][reg.index]
subset = reg[reg['lasso coef'] != 0]
print(subset.shape)
In [ ]:
In [ ]:
In [ ]:
In [44]:
#### previous scratch on lasso
In [45]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
In [46]:
# Standardize OTU counts and set up X/Y for vars_vd[1].
# BUG FIX: `scaler` was referenced before it existed (it is only created in
# a later cell), so this cell raised NameError on a fresh kernel. Create the
# StandardScaler here.
scaler = StandardScaler()
small = tmp[np.append(vars_vd[1], otu_cols)].dropna(axis=0, how='any')
X = scaler.fit_transform(small[otu_cols])
Y = small[vars_vd[1]]
In [ ]:
# drop all missing values
# NOTE(review): despite the comment above, no dropna() is applied here, and
# this reassigns `tmp`, clobbering the complete-case frame built in In[23].
# Cells that reuse `tmp` are therefore execution-order dependent — confirm
# intended behavior.
tmp = dat[np.append(vars_vd, otu_cols)]
tmp.head()
In [ ]:
# standardize OTU counts and set up design matrix / target for vars_vd[0];
# `names` is consumed by the lasso() helper defined below
scaler = StandardScaler()
small= tmp[np.append(vars_vd[0], otu_cols)].dropna(axis=0, how='any')
X = scaler.fit_transform(small[otu_cols])
Y = small[vars_vd[0]]
names = otu_cols
In [ ]:
# Create a function called lasso,
def lasso(alphas, X=None, Y=None, names=None):
    '''
    Fit one sklearn Lasso per alpha and collect the coefficients.

    Parameters
    ----------
    alphas : iterable of float
        Regularization strengths to try.
    X, Y, names : optional
        Design matrix, target vector and feature names. When omitted they
        fall back to the notebook globals of the same name, preserving the
        original `lasso(alphas)` call style.

    Returns
    -------
    pandas.DataFrame
        One 'OTU names' column plus one 'Alpha = %f' coefficient column
        per alpha.
    '''
    # Backward-compatible fallback: the original implementation read the
    # module-level X / Y / names directly.
    if X is None:
        X = globals()['X']
    if Y is None:
        Y = globals()['Y']
    if names is None:
        names = globals()['names']
    df = pd.DataFrame()
    df['OTU names'] = names
    for alpha in alphas:
        # fit a lasso at this regularization strength
        # (renamed from `lasso`, which shadowed the function itself)
        est = Lasso(alpha=alpha)
        est.fit(X, Y)
        df['Alpha = %f' % alpha] = est.coef_
    # Return the dataframe
    return df
In [ ]:
table = lasso([0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5])
In [ ]:
table.head()
In [ ]:
table.loc[table['Alpha = 0.500000'] != 0]
In [ ]:
# Look up taxonomy strings for the OTUs selected above.
# FIX: the original named this array `list`, shadowing the builtin, and used
# `bt.loc['taxonomy'][63]` — an integer key on a string-labeled Series,
# which relies on pandas' deprecated positional fallback. Use .iloc to make
# the positional intent explicit (same rows selected).
otu_positions = np.array([63, 131, 188, 237, 384, 505, 2116, 2545, 3484, 3598])
taxonomy_row = bt.loc['taxonomy']
for pos in otu_positions:
    print(taxonomy_row.iloc[pos])
In [ ]:
In [ ]:
# scratch: same standardize/setup pattern as above, now for vars_vd[1]
scaler = StandardScaler()
small= tmp[np.append(vars_vd[1], otu_cols)].dropna(axis=0, how='any')
X = scaler.fit_transform(small[otu_cols])
Y = small[vars_vd[1]]
names = otu_cols
In [ ]:
table = lasso([0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5])
In [ ]:
table.loc[table['Alpha = 3.000000'] != 0]
In [ ]:
bt.loc['taxonomy'][3323]
In [ ]:
In [ ]:
# scratch: standardize/setup + alpha grid for vars_vd[2]
scaler = StandardScaler()
small= tmp[np.append(vars_vd[2], otu_cols)].dropna(axis=0, how='any')
X = scaler.fit_transform(small[otu_cols])
Y = small[vars_vd[2]]
names = otu_cols
table = lasso([0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5])
In [ ]:
table.loc[table['Alpha = 2.000000'] != 0]
In [ ]:
In [ ]:
# scratch: standardize/setup + alpha grid for vars_vd[3]
# NOTE(review): the next unnumbered cell repeats this block verbatim — one
# of the two duplicates can be deleted.
scaler = StandardScaler()
small= tmp[np.append(vars_vd[3], otu_cols)].dropna(axis=0, how='any')
X = scaler.fit_transform(small[otu_cols])
Y = small[vars_vd[3]]
names = otu_cols
table = lasso([0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5])
In [ ]:
table.loc[table['Alpha = 2.000000'] != 0]
In [ ]:
In [ ]:
# NOTE(review): verbatim duplicate of the vars_vd[3] cell above; only the
# threshold inspected afterwards differs (alpha = 1 vs alpha = 2).
scaler = StandardScaler()
small= tmp[np.append(vars_vd[3], otu_cols)].dropna(axis=0, how='any')
X = scaler.fit_transform(small[otu_cols])
Y = small[vars_vd[3]]
names = otu_cols
table = lasso([0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5])
In [ ]:
table.loc[table['Alpha = 1.000000'] != 0]
In [ ]:
In [ ]:
# scratch: standardize/setup + alpha grid for vars_vd[4]
scaler = StandardScaler()
small= tmp[np.append(vars_vd[4], otu_cols)].dropna(axis=0, how='any')
X = scaler.fit_transform(small[otu_cols])
Y = small[vars_vd[4]]
names = otu_cols
table = lasso([0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5])
In [ ]:
table.loc[table['Alpha = 1.000000'] != 0]
In [ ]:
In [ ]:
# Single Lasso fit at alpha = 0.5 on the standardized OTU matrix.
clf = Lasso(alpha=0.5)
clf.fit(X, Y)
# FIX: the cell also contained a bare `Lasso(alpha=0.5, copy_X=True, ...)`
# expression — the estimator repr pasted back in as code. It constructed a
# throwaway object, and the `normalize=` keyword it carried is removed in
# modern scikit-learn (TypeError on re-run). Removed.
In [ ]:
print(clf.coef_)
In [ ]:
sum(np.abs(clf.coef_))
In [ ]:
print(clf.intercept_)
In [ ]: